1 Airbnb Project - Athens

1.1 Introduction

1.2 Summary of key findings

2 Initial data analysis

## Rows: 11,314
## Columns: 106
## $ id                                           <dbl> 10595, 10990, 10993, 1...
## $ listing_url                                  <chr> "https://www.airbnb.co...
## $ scrape_id                                    <dbl> 2.02e+13, 2.02e+13, 2....
## $ last_scraped                                 <date> 2020-06-17, 2020-06-1...
## $ name                                         <chr> "96m2, 3BR, 2BA, Metro...
## $ summary                                      <chr> "Athens Furnished Apar...
## $ space                                        <chr> "Athens Furnished Apar...
## $ description                                  <chr> "Athens Furnished Apar...
## $ experiences_offered                          <chr> "none", "none", "none"...
## $ neighborhood_overview                        <chr> "Ampelokipi district i...
## $ notes                                        <chr> "Although is very easy...
## $ transit                                      <chr> "Note: 5-day ticket fo...
## $ access                                       <chr> "Guest have access to ...
## $ interaction                                  <chr> "-Our reception is 10 ...
## $ house_rules                                  <chr> "- Parties, meetings, ...
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA...
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA...
## $ picture_url                                  <chr> "https://a0.muscache.c...
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA...
## $ host_id                                      <dbl> 37177, 37177, 37177, 3...
## $ host_url                                     <chr> "https://www.airbnb.co...
## $ host_name                                    <chr> "Emmanouil", "Emmanoui...
## $ host_since                                   <date> 2009-09-08, 2009-09-0...
## $ host_location                                <chr> "Athens, Attica, Greec...
## $ host_about                                   <chr> "Athens Quality Apartm...
## $ host_response_time                           <chr> "within an hour", "wit...
## $ host_response_rate                           <chr> "100%", "100%", "100%"...
## $ host_acceptance_rate                         <chr> "100%", "100%", "100%"...
## $ host_is_superhost                            <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_thumbnail_url                           <chr> "https://a0.muscache.c...
## $ host_picture_url                             <chr> "https://a0.muscache.c...
## $ host_neighbourhood                           <chr> "Ambelokipi", "Ambelok...
## $ host_listings_count                          <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ host_total_listings_count                    <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ host_verifications                           <chr> "['email', 'phone', 'r...
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE...
## $ street                                       <chr> "Athens, Attica, Greec...
## $ neighbourhood                                <chr> "Ambelokipi", "Ambelok...
## $ neighbourhood_cleansed                       <chr> "<U+0391><U+039C><U+03A0><U+0395><U+039B><U+039F><U+039A><U+0397><U+03A0><U+039F><U+0399>", "<U+0391><U+039C><U+03A0><U+0395><U+039B><U+039F>...
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA...
## $ city                                         <chr> "Athens", "Athens", "A...
## $ state                                        <chr> "Attica", "Attica", "A...
## $ zipcode                                      <chr> "11526", "11526", "115...
## $ market                                       <chr> "Athens", "Athens", "A...
## $ smart_location                               <chr> "Athens, Greece", "Ath...
## $ country_code                                 <chr> "GR", "GR", "GR", "GR"...
## $ country                                      <chr> "Greece", "Greece", "G...
## $ latitude                                     <dbl> 38, 38, 38, 38, 38, 38...
## $ longitude                                    <dbl> 23.8, 23.8, 23.8, 23.8...
## $ is_location_exact                            <lgl> TRUE, TRUE, TRUE, TRUE...
## $ property_type                                <chr> "Apartment", "Apartmen...
## $ room_type                                    <chr> "Entire home/apt", "En...
## $ accommodates                                 <dbl> 8, 4, 2, 4, 4, 4, 1, 5...
## $ bathrooms                                    <dbl> 2.0, 1.0, 1.0, 1.0, 1....
## $ bedrooms                                     <dbl> 3, 1, 0, 1, 1, 1, 1, 2...
## $ beds                                         <dbl> 5, 1, 1, 2, 1, 2, 1, 2...
## $ bed_type                                     <chr> "Real Bed", "Real Bed"...
## $ amenities                                    <chr> "{TV,\"Cable TV\",Inte...
## $ square_feet                                  <dbl> 1076, NA, NA, NA, NA, ...
## $ price                                        <chr> "$122.00", "$45.00", "...
## $ weekly_price                                 <chr> "$700.00", "$420.00", ...
## $ monthly_price                                <chr> "$2,800.00", "$1,680.0...
## $ security_deposit                             <chr> "$0.00", "$0.00", "$0....
## $ cleaning_fee                                 <chr> "$25.00", "$15.00", "$...
## $ guests_included                              <dbl> 4, 2, 2, 2, 2, 2, 1, 1...
## $ extra_people                                 <chr> "$13.00", "$5.00", "$0...
## $ minimum_nights                               <dbl> 1, 1, 1, 1, 1, 2, 1, 5...
## $ maximum_nights                               <dbl> 45, 60, 60, 60, 30, 73...
## $ minimum_minimum_nights                       <dbl> 1, 1, 1, 1, 1, 2, 1, 5...
## $ maximum_minimum_nights                       <dbl> 4, 4, 4, 4, 4, 2, 1, 5...
## $ minimum_maximum_nights                       <dbl> 45, 60, 60, 60, 30, 11...
## $ maximum_maximum_nights                       <dbl> 45, 60, 60, 60, 30, 11...
## $ minimum_nights_avg_ntm                       <dbl> 1.3, 1.4, 1.8, 1.5, 1....
## $ maximum_nights_avg_ntm                       <dbl> 45, 60, 60, 60, 30, 11...
## $ calendar_updated                             <chr> "2 weeks ago", "2 week...
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE...
## $ availability_30                              <dbl> 30, 13, 13, 17, 12, 30...
## $ availability_60                              <dbl> 60, 43, 43, 47, 42, 60...
## $ availability_90                              <dbl> 90, 73, 72, 77, 69, 83...
## $ availability_365                             <dbl> 365, 271, 347, 275, 26...
## $ calendar_last_scraped                        <date> 2020-06-17, 2020-06-1...
## $ number_of_reviews                            <dbl> 25, 34, 48, 21, 17, 45...
## $ number_of_reviews_ltm                        <dbl> 5, 3, 1, 2, 1, 31, 0, ...
## $ first_review                                 <date> 2011-05-20, 2012-09-0...
## $ last_review                                  <date> 2020-03-15, 2020-01-0...
## $ review_scores_rating                         <dbl> 97, 98, 97, 96, 95, 96...
## $ review_scores_accuracy                       <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_cleanliness                    <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_checkin                        <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_communication                  <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_location                       <dbl> 9, 10, 10, 9, 9, 10, N...
## $ review_scores_value                          <dbl> 10, 10, 10, 10, 9, 10,...
## $ requires_license                             <lgl> TRUE, TRUE, TRUE, TRUE...
## $ license                                      <chr> "478825", "400315", "4...
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA...
## $ instant_bookable                             <lgl> TRUE, TRUE, TRUE, TRUE...
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, F...
## $ cancellation_policy                          <chr> "moderate", "moderate"...
## $ require_guest_profile_picture                <lgl> FALSE, FALSE, FALSE, F...
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, FALSE, F...
## $ calculated_host_listings_count               <dbl> 6, 6, 6, 6, 6, 2, 1, 2...
## $ calculated_host_listings_count_entire_homes  <dbl> 6, 6, 6, 6, 6, 2, 0, 2...
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 1, 0...
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0...
## $ reviews_per_month                            <dbl> 0.23, 0.36, 0.51, 0.17...

2.1 Cleaning the data

skim(athens_data)
Data summary
Name athens_data
Number of rows 11314
Number of columns 106
_______________________
Column type frequency:
character 47
Date 5
logical 15
numeric 39
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 34 37 0 11314 0
name 7 1.00 1 98 0 11114 0
summary 314 0.97 1 1000 0 10140 0
space 2804 0.75 1 1000 0 7793 0
description 176 0.98 1 1000 0 10600 0
experiences_offered 0 1.00 4 4 0 1 0
neighborhood_overview 3139 0.72 1 1000 0 6611 0
notes 6750 0.40 1 1000 0 3465 0
transit 3403 0.70 1 1000 0 6470 0
access 5447 0.52 1 1000 0 4552 0
interaction 4427 0.61 1 1000 0 5010 0
house_rules 5782 0.49 2 1000 0 4155 0
picture_url 0 1.00 81 146 0 11201 0
host_url 0 1.00 39 43 0 6272 0
host_name 1 1.00 1 33 0 2650 0
host_location 17 1.00 2 94 0 575 0
host_about 4823 0.57 1 4636 0 3036 9
host_response_time 1 1.00 3 18 0 5 0
host_response_rate 1 1.00 2 4 0 47 0
host_acceptance_rate 1 1.00 2 4 0 76 0
host_thumbnail_url 1 1.00 55 106 0 6249 0
host_picture_url 1 1.00 57 109 0 6249 0
host_neighbourhood 1826 0.84 4 29 0 66 0
host_verifications 0 1.00 2 147 0 218 0
street 0 1.00 10 62 0 274 0
neighbourhood 1 1.00 4 17 0 32 0
neighbourhood_cleansed 0 1.00 4 32 0 45 0
city 4 1.00 2 30 0 94 0
state 10261 0.09 1 38 0 132 0
zipcode 262 0.98 5 12 0 212 0
market 126 0.99 6 21 0 2 0
smart_location 0 1.00 10 39 0 102 0
country_code 0 1.00 2 2 0 1 0
country 0 1.00 6 6 0 1 0
property_type 0 1.00 4 23 0 26 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 1646 0 10540 0
price 0 1.00 5 9 0 279 0
weekly_price 10790 0.05 6 9 0 148 0
monthly_price 10817 0.04 7 10 0 156 0
security_deposit 2984 0.74 5 9 0 94 0
cleaning_fee 1852 0.84 5 7 0 85 0
extra_people 0 1.00 5 7 0 39 0
calendar_updated 0 1.00 5 13 0 71 0
license 3949 0.65 4 140 0 6735 0
cancellation_policy 0 1.00 8 27 0 6 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2020-06-16 2020-06-18 2020-06-16 3
host_since 1 1.00 2009-09-08 2020-06-11 2017-03-06 2423
calendar_last_scraped 0 1.00 2020-06-16 2020-06-18 2020-06-16 3
first_review 2677 0.76 2010-07-08 2020-06-17 2018-09-07 1954
last_review 2677 0.76 2013-05-23 2020-06-17 2020-01-29 985

Variable type: logical

skim_variable n_missing complete_rate mean count
thumbnail_url 11314 0 NaN :
medium_url 11314 0 NaN :
xl_picture_url 11314 0 NaN :
host_is_superhost 1 1 0.38 FAL: 7034, TRU: 4279
host_has_profile_pic 1 1 1.00 TRU: 11287, FAL: 26
host_identity_verified 1 1 0.19 FAL: 9217, TRU: 2096
neighbourhood_group_cleansed 11314 0 NaN :
is_location_exact 0 1 0.92 TRU: 10370, FAL: 944
has_availability 0 1 1.00 TRU: 11314
requires_license 0 1 1.00 TRU: 11314
jurisdiction_names 11314 0 NaN :
instant_bookable 0 1 0.75 TRU: 8528, FAL: 2786
is_business_travel_ready 0 1 0.00 FAL: 11314
require_guest_profile_picture 0 1 0.01 FAL: 11247, TRU: 67
require_guest_phone_verification 0 1 0.01 FAL: 11201, TRU: 113

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2.74e+07 1.09e+07 1.06e+04 2.01e+07 2.90e+07 3.60e+07 4.38e+07 ▂▃▆▇▇
scrape_id 0 1.00 2.02e+13 0.00e+00 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ▁▁▇▁▁
host_id 0 1.00 1.29e+08 9.79e+07 3.72e+04 3.65e+07 1.19e+08 2.12e+08 3.50e+08 ▇▅▃▅▂
host_listings_count 1 1.00 1.68e+01 5.52e+01 0.00e+00 1.00e+00 2.00e+00 9.00e+00 1.12e+03 ▇▁▁▁▁
host_total_listings_count 1 1.00 1.68e+01 5.52e+01 0.00e+00 1.00e+00 2.00e+00 9.00e+00 1.12e+03 ▇▁▁▁▁
latitude 0 1.00 3.80e+01 1.00e-02 3.80e+01 3.80e+01 3.80e+01 3.80e+01 3.80e+01 ▃▇▆▂▁
longitude 0 1.00 2.37e+01 1.00e-02 2.37e+01 2.37e+01 2.37e+01 2.37e+01 2.38e+01 ▁▇▇▂▁
accommodates 0 1.00 3.90e+00 2.04e+00 1.00e+00 2.00e+00 4.00e+00 5.00e+00 3.00e+01 ▇▁▁▁▁
bathrooms 1 1.00 1.21e+00 5.10e-01 0.00e+00 1.00e+00 1.00e+00 1.00e+00 1.20e+01 ▇▁▁▁▁
bedrooms 10 1.00 1.40e+00 8.60e-01 0.00e+00 1.00e+00 1.00e+00 2.00e+00 1.00e+01 ▇▁▁▁▁
beds 46 1.00 2.14e+00 1.52e+00 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+01 ▇▁▁▁▁
square_feet 11226 0.01 6.68e+02 5.47e+02 0.00e+00 2.69e+02 5.92e+02 1.08e+03 2.79e+03 ▇▅▃▁▁
guests_included 0 1.00 1.92e+00 1.25e+00 1.00e+00 1.00e+00 2.00e+00 2.00e+00 1.60e+01 ▇▁▁▁▁
minimum_nights 0 1.00 4.26e+00 2.18e+01 1.00e+00 1.00e+00 2.00e+00 2.00e+00 1.00e+03 ▇▁▁▁▁
maximum_nights 0 1.00 1.63e+03 9.40e+04 1.00e+00 6.20e+01 1.12e+03 1.12e+03 1.00e+07 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 3.96e+00 1.89e+01 1.00e+00 1.00e+00 2.00e+00 2.00e+00 1.00e+03 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 5.36e+00 3.31e+01 1.00e+00 1.00e+00 2.00e+00 2.00e+00 1.00e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 8.80e+02 4.54e+02 1.00e+00 1.00e+03 1.12e+03 1.12e+03 1.00e+04 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 8.90e+02 4.46e+02 1.00e+00 1.12e+03 1.12e+03 1.12e+03 1.00e+04 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 4.37e+00 2.17e+01 1.00e+00 1.00e+00 2.00e+00 2.00e+00 1.00e+03 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 8.84e+02 4.48e+02 1.00e+00 1.03e+03 1.12e+03 1.12e+03 1.00e+04 ▇▁▁▁▁
availability_30 0 1.00 1.92e+01 1.22e+01 0.00e+00 4.00e+00 2.60e+01 3.00e+01 3.00e+01 ▃▁▁▁▇
availability_60 0 1.00 4.00e+01 2.35e+01 0.00e+00 1.80e+01 5.20e+01 5.90e+01 6.00e+01 ▃▁▁▂▇
availability_90 0 1.00 6.16e+01 3.45e+01 0.00e+00 4.10e+01 7.90e+01 8.90e+01 9.00e+01 ▃▁▁▂▇
availability_365 0 1.00 2.32e+02 1.33e+02 0.00e+00 1.21e+02 2.74e+02 3.58e+02 3.65e+02 ▃▂▂▂▇
number_of_reviews 0 1.00 3.52e+01 6.12e+01 0.00e+00 1.00e+00 9.00e+00 4.20e+01 7.11e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 1.11e+01 1.71e+01 0.00e+00 0.00e+00 3.00e+00 1.60e+01 1.57e+02 ▇▁▁▁▁
review_scores_rating 2739 0.76 9.53e+01 7.04e+00 2.00e+01 9.40e+01 9.70e+01 1.00e+02 1.00e+02 ▁▁▁▁▇
review_scores_accuracy 2751 0.76 9.76e+00 6.80e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_cleanliness 2751 0.76 9.66e+00 7.40e-01 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_checkin 2753 0.76 9.85e+00 5.50e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_communication 2752 0.76 9.84e+00 5.70e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_location 2753 0.76 9.57e+00 7.60e-01 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_value 2754 0.76 9.62e+00 7.40e-01 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 9.98e+00 2.14e+01 1.00e+00 1.00e+00 2.00e+00 7.00e+00 1.38e+02 ▇▁▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 7.57e+00 1.52e+01 0.00e+00 1.00e+00 1.00e+00 6.00e+00 8.50e+01 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 2.14e+00 1.27e+01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.12e+02 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 5.00e-02 6.40e-01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.20e+01 ▇▁▁▁▁
reviews_per_month 2677 0.76 1.65e+00 1.74e+00 1.00e-02 3.40e-01 1.00e+00 2.42e+00 1.29e+01 ▇▂▁▁▁

Based on our initial data analysis we identified 4 major types of variables in the underlying data set:

  1. Character values: 47
  2. Date values: 5
  3. Logical values: 15
  4. Numeric values: 39

We also have seen that we have 11,314 observations (apartments) & a total of 106 data points per apartment.

2.1.1 Reducing the dataset

We identified many variables that have a characteristic which make it either not interesting to analyze (only one/ very few distinct values, text strings) or that we think we will not use in the analysis later on.

So we excluded these columns/ data points in order to make the data easier & faster to handle.

athens_data_red <- athens_data %>% 
    #Select the relevant variables
  select(
         id,
         neighbourhood,
         zipcode,
         latitude,
         longitude,
         property_type,
         room_type,
         accommodates,
         bathrooms,
         bedrooms,
         beds,
         price,
         weekly_price,
         monthly_price,
         security_deposit,
         cleaning_fee,
         guests_included,
         extra_people,
         minimum_nights,
         maximum_nights,
         availability_365,
         number_of_reviews_ltm,
         review_scores_rating,
         review_scores_checkin,
         review_scores_cleanliness,
         review_scores_accuracy,
         review_scores_communication,
         review_scores_location,
         review_scores_value,
         cancellation_policy,
         reviews_per_month,
         host = host_id, 
         host_response_time,
         host_response_rate,
         host_acceptance_rate,
         host_is_superhost,
         host_listings_count,
         host_total_listings_count,
         host_identity_verified,number_of_reviews,
         host_instant_booking =  instant_bookable
  )

We now only have 41 columns left, which make the data set easier to handle.

2.1.2 Adjust data values

In a next step we will adjust the type of some variables so that we can actually can work with the data more easily.

  • We transform the price, weekly price, monthly price, security deposit, cleaning fee, extra people, host response rate and host acceptance rate from character variables to numeric ones
  • We create factor variables for Property type, room types, cancellation policy and host response time
# Transform character values to numeric values
athens_data_clean <- athens_data_red %>% 
   mutate(
     price = as.numeric(str_remove_all(price, "[$ ,]")),
     weekly_price = as.numeric(str_remove_all(weekly_price, "[$ , ]")),
     monthly_price = as.numeric(str_remove_all(monthly_price, "[$ ,]")),
     cleaning_fee = as.numeric(str_remove_all(cleaning_fee, "[$ ,]")),
     security_deposit = as.numeric(str_remove_all(security_deposit, "[$ ,]")),
     extra_people = as.numeric(str_remove_all(extra_people, "[$ ,]")),
     host_response_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]")),
     host_acceptance_rate = as.numeric(str_remove_all(cleaning_fee, "[% ,]"))
     )
# Create factor variables for room types 
room_types <- unique(athens_data_clean$room_type)
athens_data_clean$room_type <- factor(athens_data_clean$room_type, labels = room_types)

# Create factor variables for cancellation policies 
cancellation_policies <- unique(athens_data_clean$cancellation_policy)
athens_data_clean$cancellation_policy <- factor(athens_data_clean$cancellation_policy, labels = cancellation_policies)

# Create factor variables for host response time 
athens_data_clean <- athens_data_clean %>% 
  mutate(host_response_time = fct_relevel(host_response_time,
                                            "within an hour", 
                                            "within a few hours",
                                            "within a day",
                                            "a few days or more",
                                            ))

The issue with the property types is that there are to much in order to generate reasonable factors. We need to analyze how much the share of each category. Best case would be that the majority of the property type share is done with a small number. If that is the case we can just summarize the rest in a new category calles "other".

# Identify the amount of each property type
most_com_properties <- athens_data_clean %>%
    count(property_type) %>%
    mutate(percentage = n/sum(n)*100)%>%
    arrange(desc(n))

most_com_properties
## # A tibble: 26 x 3
##    property_type          n percentage
##    <chr>              <int>      <dbl>
##  1 Apartment           9677     85.5  
##  2 House                386      3.41 
##  3 Condominium          261      2.31 
##  4 Serviced apartment   187      1.65 
##  5 Loft                 180      1.59 
##  6 Aparthotel           139      1.23 
##  7 Hotel                135      1.19 
##  8 Boutique hotel       120      1.06 
##  9 Bed and breakfast     49      0.433
## 10 Hostel                38      0.336
## # ... with 16 more rows

As the 5 most common property types account for ~95% of the total share we can just focus on them and summarize the rest in "Others"

# First we need to summarize the other values in the Category "Others"
athens_data_clean <- athens_data_clean %>% 
  mutate(
    property_type = case_when(
      property_type %in% c("Apartment","House", "Condominium","Serviced Apartment", "Loft") 
      ~ property_type, 
      TRUE ~ "Other"))
    

# In a next step we can make a factor out of the 6 pre-defined categories    
athens_data_clean <- athens_data_clean %>% 
  mutate(
     property_type = fct_relevel(property_type,
                                        "Apartment",
                                        "House",
                                        "Condominium",
                                        "Serviced Apartment",
                                        "Loft",
                                        "Other"))

We now have transformed the data types of the most variables in order to make the data set even cleaner. We have deleted unnecessary values, adjusted wrong variable types and now we will further inspect the quality of our data.

2.1.3 Readjust NA values

In a this step we will further manipulate the data set. In specific we will correct the NA values in cases in which we can estimate the value.

  • If no weekly price -> no discount -> we will insert the daily price multiplied by 7
  • If no monthly price -> no discount -> we will insert the daily price multiplied by 30
  • If no security deposit/ cleaning fee -> no fee -> we will insert 0
# We will replace the NAs in the weekly prices and assume there is no discount if NA
 athens_data_clean$weekly_price[is.na(athens_data_clean$weekly_price)] <- 
  athens_data_clean$price *7


# We will replace the NAs in the monthly prices and assume there is no discount if NA
 athens_data_clean$monthly_price[is.na(athens_data_clean$monthly_price)] <- 
  athens_data_clean$price * 30


# We will replace the NAs in the security deposit & cleaning fee and assume 0 if NA
 athens_data_clean$cleaning_fee[is.na(athens_data_clean$cleaning_fee)] <- 0
 athens_data_clean$security_deposit[is.na(athens_data_clean$security_deposit)] <- 0

We now have cleaned the data to a nearly perfect amount. The only thing we haven't yet included are outliers which will be captured in the next paragraph.

2.1.4 Readjust outliers

We will screen the most important variable price, which we need in our analysis later on, for potential outliers. We will exclude the extreme values, which make no sense economically (way too high prices). Reasons which could explain these extremly high prices are unwillingness to list at the moment, fake listings or extremly luxurious apartments.

# Quick plot to see outliers
athens_data_clean %>% 
  ggplot(aes(x = price)) +
  geom_histogram() +
  labs(title= "Distribution of prices in our original data")

# Looks very scewed, probably a log-normal distribution, use log -> normal
athens_data_clean %>% 
  ggplot(aes(x = log(price))) +
  geom_histogram()

# There seem to be a few outliers. We will remove them using the IQR method, becauses we belive that keeping those values would skew our analysis
IQR.outliers <- function(x) {
  Q3 <- quantile(x,0.95)
  Q1 <- quantile(x,0.05)
  IQR <- (Q3-Q1)
  left <- (Q1-(1.5*IQR))
  right <- (Q3+(1.5*IQR))
  print(c(left, right))
  c(x[x <left],x[x>right])
}

# Print outliers
IQR.outliers(athens_data_clean$price)
##   5%  95% 
## -180  352
##  [1]  354  600  459  400  400  385 1000  515  410  640  412  502  650  400  400
## [16]  600  450 1000 1000 1000  495 1000  400  500  500  500  500  500  500  525
## [31]  404  800  700  500  402  450  450  540  360  810  487 7000 7000 7000 7000
## [46]  390  460  400  600  400  353  357  400  426  500 1500  900 1200  450  450
## [61]  800  400  990  600 1000  500 1000 5000  400  800  360 1000  390  500  500
## [76]  400  400  400  600 1290  999 1000  700  720  700  700 1000
athens_data_clean %>% 
  filter(!(price %in% IQR.outliers(athens_data_clean$price))) %>% 
  ggplot(aes(x = log(price))) +
  geom_histogram()
##   5%  95% 
## -180  352

#Defining our final data set, which has no more outliers
athens_data_final <- athens_data_clean %>% 
  filter(!(price %in% IQR.outliers(athens_data_clean$price)))
##   5%  95% 
## -180  352

2.2 First analysis of data

As we now have finally derived with a data set, which has only the relevant values, right variable types, adjusted NA values and is corrected for outliers, we can finally start with the analysis of the data.

# Plot our airbnbs
qmplot(longitude, latitude, data = athens_data_final, color = price)

# Syntagma coordinates
syntagma <- c(37.975344, 23.73472)
names(syntagma) <- c("longitude", "latitude")

# Athene map
athens_map = get_map(location=c(23.68,
                                37.945,
                                23.8,
                                38.035), maptype="terrain-background")

athens_map <- ggmap(athens_map)

# We dont want to see the axis when we are ploting maps
map_theme <-  theme(axis.title.x=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    axis.title.y=element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank())

# Plot the map and Syntagma, is there a connection between prices and the centre?
athens_map +
  geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = price)) +
  geom_point(aes(x = syntagma['latitude'], syntagma['longitude']), 
             color = 'red', size = 5) +
  theme_minimal() +
  map_theme +
  labs(title="Airbnbs around the centre seem to be more expensive", 
       subtitle = "Centre - Syntagma Square")

# Now that we assume that there is a connection, we calculate the distance for each airbnb


head(athens_data_final)
## # A tibble: 6 x 41
##      id neighbourhood zipcode latitude longitude property_type room_type
##   <dbl> <chr>         <chr>      <dbl>     <dbl> <fct>         <fct>    
## 1 10595 Ambelokipi    11526       38.0      23.8 Apartment     Entire h~
## 2 10990 Ambelokipi    11526       38.0      23.8 Apartment     Entire h~
## 3 10993 Ambelokipi    115 26      38.0      23.8 Apartment     Entire h~
## 4 10995 Ambelokipi    11526       38.0      23.8 Apartment     Entire h~
## 5 27262 Ambelokipi    11526       38.0      23.8 Apartment     Entire h~
## 6 28186 Plaka         105 63      38.0      23.7 Loft          Entire h~
## # ... with 34 more variables: accommodates <dbl>, bathrooms <dbl>,
## #   bedrooms <dbl>, beds <dbl>, price <dbl>, weekly_price <dbl>,
## #   monthly_price <dbl>, security_deposit <dbl>, cleaning_fee <dbl>,
## #   guests_included <dbl>, extra_people <dbl>, minimum_nights <dbl>,
## #   maximum_nights <dbl>, availability_365 <dbl>, number_of_reviews_ltm <dbl>,
## #   review_scores_rating <dbl>, review_scores_checkin <dbl>,
## #   review_scores_cleanliness <dbl>, review_scores_accuracy <dbl>,
## #   review_scores_communication <dbl>, review_scores_location <dbl>,
## #   review_scores_value <dbl>, cancellation_policy <fct>,
## #   reviews_per_month <dbl>, host <dbl>, host_response_time <fct>,
## #   host_response_rate <dbl>, host_acceptance_rate <dbl>,
## #   host_is_superhost <lgl>, host_listings_count <dbl>,
## #   host_total_listings_count <dbl>, host_identity_verified <lgl>,
## #   number_of_reviews <dbl>, host_instant_booking <lgl>
# Calculate the distance
athens_data_final<- athens_data_final %>% 
  rowwise() %>% 
  mutate(
    cent_dist = distm(c(latitude, longitude), c(37.975344, 23.73472), 
                      fun = distHaversine)[1,1]
  )

# Test if our numbers are correct visually
athens_map +
  geom_point(data=athens_data_final, aes(x = longitude, y = latitude, color = cent_dist)) +
  geom_point(aes(x = syntagma['latitude'], syntagma['longitude']), color = 'red', size = 5) +
  theme_minimal() +
  map_theme

# How many room types are there?
length(unique(athens_data_final$room_type))
## [1] 4
avg_dist <- athens_data_final %>% 
  group_by(neighbourhood) %>% 
  summarise(
    avg_dist = mean(cent_dist)
  ) %>% 
  arrange(-avg_dist)

athens_data_final %>% 
  filter(!is.na(neighbourhood)) %>% 
  select(neighbourhood,
         room_type) %>% 
  group_by(neighbourhood,
           room_type) %>% 
  summarise(n = n()) %>% 
  mutate(perc = n/sum(n)) %>% 
  ggplot(aes(fill=room_type, x=perc, y=factor(neighbourhood,levels = avg_dist$neighbourhood))) + 
    geom_bar(position="fill", stat="identity") +
  labs(title="Average distance from the centre does not seem to impact room types",
       subtitle = "Average distance in decreasing order") +
  ylab("") +
  xlab("") +
  guides(fill=guide_legend(title="Room types"))

2.3 Model

We will try to find the best fitting model to predict per night prices

athens_data_final %>% 
  na.omit() %>% 
  select_if(is.numeric) %>% 
  cor() %>% 
  as.data.frame() %>% 
  select(price) %>% 
  add_rownames(var = "variable") %>%
  arrange(price) %>% 
  ggplot(aes(x = price, y = reorder(variable, price))) +
  geom_col() +
  ylab("") +
  xlab("Correlation") +
  labs(title = "Distance from central is the most negative correlation",
       subtitle = "Correlations with price")

athens_data_final %>% 
  ggplot(aes(x=cent_dist, y=price)) +
  geom_point()

Using correlations doesnt seem to work too well, we will need to find another way

2.3.1 Possible models

# First we will split our data into a training and testing set
# Set seed so we will get the same results
set.seed(202119)

size <- floor(0.75 * nrow(athens_data_final))
train_ind <- sample(seq_len(nrow(athens_data_final)), size = size)

train <- athens_data_final[train_ind, ]
test <- athens_data_final[-train_ind, ]

OLS

library(stats)

# To choose a model we will use Akaike's information criterion

# Univariate regression

model1 <- lm(log(price) ~ as.factor(accommodates), data=train) 
# Are airbnbs that accomodate 8 people necessarily 2 times as expensive? We do not think so, therefore we use factors instead.

summary(model1) 
## 
## Call:
## lm(formula = log(price) ~ as.factor(accommodates), data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5461 -0.3660 -0.0536  0.2873  2.5607 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.9608     0.0316   93.63   <2e-16 ***
## as.factor(accommodates)2    0.6837     0.0336   20.32   <2e-16 ***
## as.factor(accommodates)3    0.8064     0.0351   22.96   <2e-16 ***
## as.factor(accommodates)4    0.9811     0.0332   29.53   <2e-16 ***
## as.factor(accommodates)5    1.0790     0.0372   28.98   <2e-16 ***
## as.factor(accommodates)6    1.2585     0.0365   34.46   <2e-16 ***
## as.factor(accommodates)7    1.3169     0.0522   25.21   <2e-16 ***
## as.factor(accommodates)8    1.5177     0.0486   31.20   <2e-16 ***
## as.factor(accommodates)9    1.5841     0.0834   18.99   <2e-16 ***
## as.factor(accommodates)10   1.6629     0.0819   20.29   <2e-16 ***
## as.factor(accommodates)11   1.7005     0.1402   12.13   <2e-16 ***
## as.factor(accommodates)12   1.8879     0.1125   16.77   <2e-16 ***
## as.factor(accommodates)13   1.2676     0.2025    6.26    4e-10 ***
## as.factor(accommodates)14   1.8410     0.1402   13.13   <2e-16 ***
## as.factor(accommodates)15   2.0437     0.2025   10.09   <2e-16 ***
## as.factor(accommodates)16   1.9380     0.1049   18.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.529 on 8404 degrees of freedom
## Multiple R-squared:  0.235,  Adjusted R-squared:  0.234 
## F-statistic:  173 on 15 and 8404 DF,  p-value: <2e-16
summary(model1)$r.squared # R2 same as correlation squared, because of univariate model
## [1] 0.235
# cor(log(train$price), train$accommodates)^2

model1 %>% AIC() # 13266
## [1] 13195
lm(log(price) ~ accommodates + bedrooms, data=train)
## 
## Call:
## lm(formula = log(price) ~ accommodates + bedrooms, data = train)
## 
## Coefficients:
##  (Intercept)  accommodates      bedrooms  
##       3.3370        0.1107        0.0838

It would be interesting to use sentiment analysis and use it for pricing

from textblob import TextBlob

testimonial = TextBlob("I love cupcakes.")

print(testimonial.sentiment.polarity)
## 0.5
a = testimonial.sentiment.polarity
py$testimonial
## I love cupcakes.
py$a
## [1] 0.5